Katerin Bejarano - Rubén Castillo
# Librerías
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
import pandas_profiling
from scipy.stats import ttest_ind
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
# Cargar los datos
data= pd.read_csv('UCI_Credit_Card.csv',
sep=',',
encoding='latin-1'
)
data.head(5)
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | ... | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | default.payment.next.month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 20000.0 | 2 | 2 | 1 | 24 | 2 | 2 | -1 | -1 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 689.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 |
| 1 | 2 | 120000.0 | 2 | 2 | 2 | 26 | -1 | 2 | 0 | 0 | ... | 3272.0 | 3455.0 | 3261.0 | 0.0 | 1000.0 | 1000.0 | 1000.0 | 0.0 | 2000.0 | 1 |
| 2 | 3 | 90000.0 | 2 | 2 | 2 | 34 | 0 | 0 | 0 | 0 | ... | 14331.0 | 14948.0 | 15549.0 | 1518.0 | 1500.0 | 1000.0 | 1000.0 | 1000.0 | 5000.0 | 0 |
| 3 | 4 | 50000.0 | 2 | 2 | 1 | 37 | 0 | 0 | 0 | 0 | ... | 28314.0 | 28959.0 | 29547.0 | 2000.0 | 2019.0 | 1200.0 | 1100.0 | 1069.0 | 1000.0 | 0 |
| 4 | 5 | 50000.0 | 1 | 2 | 1 | 57 | -1 | 0 | -1 | 0 | ... | 20940.0 | 19146.0 | 19131.0 | 2000.0 | 36681.0 | 10000.0 | 9000.0 | 689.0 | 679.0 | 0 |
5 rows × 25 columns
# Perfilamiento de datos.
perfilamiento=pandas_profiling.ProfileReport(data, minimal=True)
# Perfilamiento de los datos en formato html
perfilamiento.to_file("Perfil de datos clientes.html")
# Gráfico de correlación de variables
corr = data.corr()
sns.heatmap(corr,
xticklabels=corr.columns.values,
yticklabels=corr.columns.values)
plt.show()
## Estadísticas básicas a variables numèricas
data[['AGE','LIMIT_BAL','BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6','PAY_AMT1','PAY_AMT2','PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6']].describe()
| AGE | LIMIT_BAL | BILL_AMT1 | BILL_AMT2 | BILL_AMT3 | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 30000.000000 | 30000.000000 | 30000.000000 | 30000.000000 | 3.000000e+04 | 30000.000000 | 30000.000000 | 30000.000000 | 30000.000000 | 3.000000e+04 | 30000.00000 | 30000.000000 | 30000.000000 | 30000.000000 |
| mean | 35.485500 | 167484.322667 | 51223.330900 | 49179.075167 | 4.701315e+04 | 43262.948967 | 40311.400967 | 38871.760400 | 5663.580500 | 5.921163e+03 | 5225.68150 | 4826.076867 | 4799.387633 | 5215.502567 |
| std | 9.217904 | 129747.661567 | 73635.860576 | 71173.768783 | 6.934939e+04 | 64332.856134 | 60797.155770 | 59554.107537 | 16563.280354 | 2.304087e+04 | 17606.96147 | 15666.159744 | 15278.305679 | 17777.465775 |
| min | 21.000000 | 10000.000000 | -165580.000000 | -69777.000000 | -1.572640e+05 | -170000.000000 | -81334.000000 | -339603.000000 | 0.000000 | 0.000000e+00 | 0.00000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 28.000000 | 50000.000000 | 3558.750000 | 2984.750000 | 2.666250e+03 | 2326.750000 | 1763.000000 | 1256.000000 | 1000.000000 | 8.330000e+02 | 390.00000 | 296.000000 | 252.500000 | 117.750000 |
| 50% | 34.000000 | 140000.000000 | 22381.500000 | 21200.000000 | 2.008850e+04 | 19052.000000 | 18104.500000 | 17071.000000 | 2100.000000 | 2.009000e+03 | 1800.00000 | 1500.000000 | 1500.000000 | 1500.000000 |
| 75% | 41.000000 | 240000.000000 | 67091.000000 | 64006.250000 | 6.016475e+04 | 54506.000000 | 50190.500000 | 49198.250000 | 5006.000000 | 5.000000e+03 | 4505.00000 | 4013.250000 | 4031.500000 | 4000.000000 |
| max | 79.000000 | 1000000.000000 | 964511.000000 | 983931.000000 | 1.664089e+06 | 891586.000000 | 927171.000000 | 961664.000000 | 873552.000000 | 1.684259e+06 | 896040.00000 | 621000.000000 | 426529.000000 | 528666.000000 |
# Función de decodificar sexo
def cat_SEX(SEX):
if SEX==1: val="Hombre"
elif SEX==2: val="Mujer"
elif SEX==0: val="Sin información"
return val
# Aplicamos la función a la columna correspondiente
data['SEX']=data['SEX'].apply(cat_SEX)
# Frecuencia absoluta
frec_data=data['SEX'].value_counts().reset_index()
frec_data.head(3)
| index | SEX | |
|---|---|---|
| 0 | Mujer | 18112 |
| 1 | Hombre | 11888 |
# Función de decodificar Educación
def cat_EDUCATION(EDUCATION):
if EDUCATION==1: val="Primaria"
elif EDUCATION==2: val="Universitario"
elif EDUCATION==3: val="Bachiller"
elif EDUCATION==4: val="Otros"
elif EDUCATION==5: val="No se sabe"
elif EDUCATION==6: val="No se sabe"
elif EDUCATION==0: val="No se sabe"
return val
# Aplicamos la función a la columna correspondiente
data['EDUCATION']=data['EDUCATION'].apply(cat_EDUCATION)
# Frecuencia absoluta
frec_data=data['EDUCATION'].value_counts().reset_index()
frec_data.head(8)
| index | EDUCATION | |
|---|---|---|
| 0 | Universitario | 14030 |
| 1 | Primaria | 10585 |
| 2 | Bachiller | 4917 |
| 3 | No se sabe | 345 |
| 4 | Otros | 123 |
# Función de decodificar Estado civil
def cat_MARRIAGE(MARRIAGE):
if MARRIAGE==1: val="Casado"
elif MARRIAGE==2: val="Soltero"
elif MARRIAGE==3: val="Otro"
elif MARRIAGE==0: val="Sin información"
return val
# Aplicamos la función a la columna correspondiente
data['MARRIAGE']=data['MARRIAGE'].apply(cat_MARRIAGE)
# Frecuencia absoluta
frec_data=data['MARRIAGE'].value_counts().reset_index()
frec_data.head(5)
| index | MARRIAGE | |
|---|---|---|
| 0 | Soltero | 15964 |
| 1 | Casado | 13659 |
| 2 | Otro | 323 |
| 3 | Sin información | 54 |
# Función de condiciones Edad
conditions = [
(data['AGE'] <= 19),
(data['AGE'] >= 20) & (data['AGE'] < 31),
(data['AGE'] >= 31) & (data['AGE'] < 41),
(data['AGE'] >= 41) & (data['AGE'] < 51),
(data['AGE'] >= 51) & (data['AGE'] < 61),
(data['AGE'] >= 61)
]
# Lista de valores de las condiciones
values = ['Menores de 20','20 a 30', '30 a 40','40 a 50', '50 a 60', 'Mayores de 60']
# Nueva columna de rangos para Edad
data['AGE1'] = np.select(conditions, values)
# Frecuencia absoluta
frec_data=data['AGE1'].value_counts().reset_index()
frec_data.head(10)
| index | AGE1 | |
|---|---|---|
| 0 | 20 a 30 | 11013 |
| 1 | 30 a 40 | 10713 |
| 2 | 40 a 50 | 6005 |
| 3 | 50 a 60 | 1997 |
| 4 | Mayores de 60 | 272 |
# Función de decodificar Desembolso
conditions = [
(data['LIMIT_BAL'] < 50000),
(data['LIMIT_BAL'] >= 50000) & (data['LIMIT_BAL'] < 100000),
(data['LIMIT_BAL'] >= 100000) & (data['LIMIT_BAL'] < 150000),
(data['LIMIT_BAL'] >= 150000) & (data['LIMIT_BAL'] < 200000),
(data['LIMIT_BAL'] >= 200000) & (data['LIMIT_BAL'] < 250000),
(data['LIMIT_BAL'] >= 250000) & (data['LIMIT_BAL'] < 300000),
(data['LIMIT_BAL'] >= 300000) & (data['LIMIT_BAL'] < 350000),
(data['LIMIT_BAL'] >= 350000) & (data['LIMIT_BAL'] < 400000),
(data['LIMIT_BAL'] >= 400000)
]
# Lista de valores de las condiciones Desembolso
values = ['Menor a 50', '50 a 100', '100 a 150', '150 a 200', '200 a 250', '250 a 300', '300 a 350', '350 a 400', 'Mayor igual a 400']
# Nueva columna de rangos para el Desembolso
data['LIMIT_BAL1'] = np.select(conditions, values)
# Frecuencia absoluta
frec_data=data['LIMIT_BAL1'].value_counts().reset_index()
frec_data.head(10)
| index | LIMIT_BAL1 | |
|---|---|---|
| 0 | 50 a 100 | 7139 |
| 1 | Menor a 50 | 4311 |
| 2 | 200 a 250 | 4083 |
| 3 | 100 a 150 | 3840 |
| 4 | 150 a 200 | 3560 |
| 5 | Mayor igual a 400 | 2075 |
| 6 | 250 a 300 | 1950 |
| 7 | 300 a 350 | 1529 |
| 8 | 350 a 400 | 1513 |
# Función de decodificar
def cat_PAY(PAY):
if PAY==-1: val="Al dia"
elif PAY==0: val="Sin informacion"
elif PAY==1: val="1 mes de mora"
elif PAY==2: val="2 meses de mora"
elif PAY==3: val="3 meses de mora"
elif PAY==4: val="4 meses de mora"
elif PAY==5: val="5 meses de mora"
elif PAY==6: val="6 meses de mora"
elif PAY==7: val="7 meses de mora"
elif PAY==8: val="8 meses de mora"
elif PAY==-2: val="Dato Invalido"
return val
# Aplicamos la función a la columna correspondiente
data['PAY_01']=data['PAY_0'].apply(cat_PAY)
data['PAY_21']=data['PAY_2'].apply(cat_PAY)
data['PAY_31']=data['PAY_3'].apply(cat_PAY)
data['PAY_41']=data['PAY_4'].apply(cat_PAY)
data['PAY_51']=data['PAY_5'].apply(cat_PAY)
data['PAY_61']=data['PAY_6'].apply(cat_PAY)
# Frecuencia absoluta
frec_sep=data['PAY_01'].value_counts().reset_index()
frec_ago=data['PAY_21'].value_counts().reset_index()
frec_jul=data['PAY_31'].value_counts().reset_index()
frec_jun=data['PAY_41'].value_counts().reset_index()
frec_may=data['PAY_51'].value_counts().reset_index()
frec_abr=data['PAY_61'].value_counts().reset_index()
# Crea campo Mes
frec_sep=frec_sep.rename(columns={'PAY_01': 'PAY'})
frec_sep['MES']='SEPTIEMBRE'
frec_ago=frec_ago.rename(columns={'PAY_21': 'PAY'})
frec_ago['MES']='AGOSTO'
frec_jul=frec_jul.rename(columns={'PAY_31': 'PAY'})
frec_jul['MES']='JULIO'
frec_jun=frec_jun.rename(columns={'PAY_41': 'PAY'})
frec_jun['MES']='JUNIO'
frec_may=frec_may.rename(columns={'PAY_51': 'PAY'})
frec_may['MES']='MAYO'
frec_abr=frec_abr.rename(columns={'PAY_61': 'PAY'})
frec_abr['MES']='ABRIL'
# Crea la tabla de PAY0 a PAY 6
pay_row = pd.concat([frec_sep, frec_ago,frec_jul,frec_jun,frec_may,frec_abr])
#Gráfico de barras
pay_row1=pay_row.groupby(['MES','index','PAY']).sum().reset_index()
pay_row1.columns=['MES','index','PAY']
fig = px.bar(pay_row1, x="index", y="PAY", color="MES", title="Gráfico de Moras por Mes")
fig.show()
# Quantile Desembolso
data['LIMIT_BAL'].quantile([0.10,0.25,0.50,0.75,0.90])
0.10 30000.0 0.25 50000.0 0.50 140000.0 0.75 240000.0 0.90 360000.0 Name: LIMIT_BAL, dtype: float64
# Desembolso
fig = sm.qqplot(data['LIMIT_BAL'], stats.t, fit=True, line="45")
plt.show()
C:\Users\Kata\anaconda3\envs\cda_talleres\lib\site-packages\statsmodels\graphics\gofplots.py:993: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "bo" (-> marker='o'). The keyword argument will take precedence.
# Grafico de barras Desembolso y Edad
frec_data=data.groupby(['LIMIT_BAL1','AGE']).count().reset_index()[['LIMIT_BAL1','AGE','ID']]
frec_data.columns=['LIMIT_BAL1','AGE','conteo']
fig = px.bar(frec_data, x="LIMIT_BAL1", y="conteo", color="AGE", title="Gráfico de Desembolso y Edad")
fig.show()
# Grafico de barras Desembolso y Sexo
frec_data=data.groupby(['LIMIT_BAL1','SEX']).count().reset_index()[['LIMIT_BAL1','SEX','ID']]
frec_data.columns=['LIMIT_BAL1','SEX','conteo']
fig = px.bar(frec_data, x="LIMIT_BAL1", y="conteo", color="SEX", title="Gráfico de Desembolso y Sexo")
fig.show()
# Grafico de barras Edad y Sexo
frec_data=data.groupby(['AGE','SEX']).count().reset_index()[['AGE','SEX','ID']]
frec_data.columns=['AGE','SEX','conteo']
fig = px.bar(frec_data, x="AGE", y="conteo", color="SEX", title="Gráfico de Edad y Sexo")
fig.show()
# Gráfico de dispersión Edad, Desembolso y Sexo
fig = px.scatter(data, x="AGE", y="LIMIT_BAL",color='SEX')
fig.show()
# Histogramas de Bill
fig = go.Figure()
fig.add_trace(go.Histogram(x=data['BILL_AMT1']))
fig.add_trace(go.Histogram(x=data['BILL_AMT2']))
fig.add_trace(go.Histogram(x=data['BILL_AMT3']))
fig.add_trace(go.Histogram(x=data['BILL_AMT4']))
fig.add_trace(go.Histogram(x=data['BILL_AMT5']))
fig.add_trace(go.Histogram(x=data['BILL_AMT6']))
# Graficar
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()
# Histogramas de Pagos
fig = go.Figure()
fig.add_trace(go.Histogram(x=data['PAY_AMT1']))
fig.add_trace(go.Histogram(x=data['PAY_AMT2']))
fig.add_trace(go.Histogram(x=data['PAY_AMT3']))
fig.add_trace(go.Histogram(x=data['PAY_AMT4']))
fig.add_trace(go.Histogram(x=data['PAY_AMT5']))
fig.add_trace(go.Histogram(x=data['PAY_AMT6']))
# Graficar
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()
## Diagrama box plot Desembolso
fig = px.box(data, y="LIMIT_BAL")
fig.show()
PRUEBAS
# t-test